RÉSUMÉ DES VARIABLES D’INTÉRÊT:
# Load the filtered HEV metadata table from disk with read.csv2, then
# preview the first rows of its first five columns.
data_HEV <- read.csv2(file = "HEV_metadonneesFiltrees.csv")
head(data_HEV[, seq_len(5)])
## ID ACCESSION SOURCE ORGANISM
## 1 2574440105 OR224867\n Hepeviridae sp.\n Hepeviridae sp.\n
## 2 2553477909 MZ751062\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 3 2553477905 MZ751061\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 4 2550033136 LC704570\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 5 2550033132 LC704569\n Paslahepevirus balayani\n Paslahepevirus balayani\n
## 6 2528474488 OP909736\n Hepeviridae sp.\n Hepeviridae sp.\n
## AUTHORS
## 1 Liu,T., Yuan,D. and Wang,L.\n
## 2 Liang,C., Zhao,C., Huang,W., Wang,Y. and She,R.\n
## 3 Liang,C., Zhao,C., Huang,W., Wang,Y. and She,R.\n
## 4 Takahashi,M., Kunita,S., Nishizawa,T., Ohnishi,H.,\n
## 5 Takahashi,M., Kunita,S., Nishizawa,T., Ohnishi,H.,\n
## 6 Klink,P., Harms,D., Altmann,B., Dorffel,Y., Morgera,U., Zander,S.,\n
# Preview the first rows of columns 6 through 10 of the metadata table.
head(data_HEV[, seq(from = 6, to = 10)])
## TITLE
## 1 Direct Submission\n
## 2 Direct Submission\n
## 3 Direct Submission\n
## 4 Infection Dynamics and Genomic Mutations of Hepatitis E Virus in\n
## 5 Infection Dynamics and Genomic Mutations of Hepatitis E Virus in\n
## 6 Molecular characterisation of a rabbit Hepatitis E Virus strain\n
## Assembly.Method Sequencing.Technology Location.Qualifiers
## 1 MobaXterm v. 2022.1.6\n PacBio\n 1..7296\n
## 2 spades v. 3.0.1\n Illumina\n 1..7331\n
## 3 spades v. v3.0.1\n Illumina\n 1..7311\n
## 4 <NA> <NA> 1..7247\n
## 5 <NA> <NA> 1..7246\n
## 6 bwa v. 0.7.15\n Illumina\n 1..7282\n
## genotype
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 genotype 3 (subtype 3b)"\n
## 5 genotype 3 (subtype 3b)"\n
## 6 <NA>
# Columns treated as categorical variables; they are converted to factors so
# that summary() reports per-level counts instead of generic character stats.
factor_cols <- c(
  "Assembly.Method", "Sequencing.Technology", "genotype",
  "isolation_source", "host", "country", "ORF"
)
data_HEV[factor_cols] <- lapply(data_HEV[factor_cols], factor)

# Per-level counts (and NA counts) for each categorical column.
summary(data_HEV[, factor_cols])
## Assembly.Method Sequencing.Technology
## Tanoti v. 1\n : 40 Sanger dideoxy sequencing\n :144
## Geneious v. 2021\n: 24 Illumina\n :115
## BioEdit v. 7.2.5\n: 8 Illumina; Sanger dideoxy sequencing\n: 20
## Bowtie v. 2\n : 8 IonTorrent\n : 6
## IDBA-UD v. 1.1.1\n: 8 454\n : 4
## (Other) : 90 (Other) : 13
## NA's :898 NA's :774
## genotype isolation_source host
## genotype 3\n :108 "blood" :297 "Homo sapiens":488
## genotype 3"\n : 24 "liver" : 82 "swine" : 44
## genotype 4"\n : 14 "feces" : 46 "rabbit" : 39
## genotype 4, isolate:\n: 10 "serum" : 42 "Sus scrofa" : 20
## genotype 3, isolate:\n: 7 "plasma": 41 "wild boar" : 18
## (Other) : 91 (Other) :198 (Other) :201
## NA's :822 NA's :370 NA's :266
## country ORF
## "France" :316 "ORF4": 2
## "China" :116 ORF1 :236
## "Japan:Hokkaido, Sapporo": 78 ORF2 : 1
## "Japan" : 27 ORF3 : 5
## "Japan:Tokyo" : 27 NA's :832
## "Australia" : 26
## (Other) :486
VISUALISATION DES DONNÉES:
library(plotly)
# Number of sequences recorded for each value of Annee.
sequences_by_year <- summarise(group_by(data_HEV, Annee), SequenceCount = n())

# Axis settings: one tick per year, starting at the earliest year in the data.
year_axis <- list(
  title = "Année",
  tickmode = "linear",
  tick0 = min(data_HEV$Annee),
  dtick = 1
)
count_axis <- list(title = "Nombre de Séquences")

# Interactive line chart (with markers) of the yearly sequence counts.
yearly_plot <- plot_ly(
  sequences_by_year,
  x = ~Annee,
  y = ~SequenceCount,
  type = "scatter",
  mode = "lines+markers",
  text = ~SequenceCount
)
layout(
  yearly_plot,
  title = "Évolution du Nombre de Séquences au Fil des Années",
  xaxis = year_axis,
  yaxis = count_axis,
  hovermode = "closest"
)
# Choropleth of the number of sequences per country.
#
# The raw `country` values carry literal double quotes and, for some records,
# a ":region" suffix (e.g. "Japan:Tokyo"). With locationmode = "country names"
# the location string must be a plain country name, so the value is normalised
# first: quotes removed, everything after the first ":" dropped, surrounding
# whitespace trimmed. This also merges the regional entries of one country
# into a single count. NA countries stay NA.
data_HEV %>%
  group_by(
    country_name = trimws(
      sub(":.*$", "", gsub("\"", "", as.character(country), fixed = TRUE))
    )
  ) %>%
  summarise(Count = n()) %>%
  plot_geo(locations = ~country_name, z = ~Count, locationmode = "country names") %>%
  layout(title = "Répartition des Données par Pays")
# Number of sequences per ORF annotation (NA forms its own group).
orf_counts <- summarise(group_by(data_HEV, ORF), Count = n())

# Pie chart of the ORF distribution.
layout(
  plot_ly(orf_counts, labels = ~ORF, values = ~Count, type = "pie"),
  title = "Distribution ORF"
)
# Pie chart of the number of sequences per sequencing technology.
# The chart title was truncated ("... Technologies de Sequenca"); it is
# spelled out in full here.
data_HEV %>%
  group_by(Sequencing.Technology) %>%
  summarise(Count = n()) %>%
  plot_ly(labels = ~Sequencing.Technology, values = ~Count, type = "pie") %>%
  layout(title = "Distribution des séquences en fonction des technologies de séquençage")
# Number of sequences per ORF annotation, drawn as a bar chart.
orf_bar_data <- data_HEV %>%
  group_by(ORF) %>%
  summarise(Count = n())

orf_bar_plot <- plot_ly(orf_bar_data, x = ~ORF, y = ~Count, type = "bar")
layout(orf_bar_plot, title = "Repartition des sequences en fonction des ORF")
## Warning: Ignoring 1 observations
# Number of sequences per SOURCE value, drawn as a bar chart.
source_counts <- summarise(group_by(data_HEV, SOURCE), Count = n())

layout(
  plot_ly(source_counts, x = ~SOURCE, y = ~Count, type = "bar"),
  title = "Repartition des sequences en fonction des souches"
)
# Bar chart of the number of sequences per isolation source.
#
# The bar colour is set through `marker`, not through plot_ly()'s `color`
# argument: `color = "Yellow"` is interpreted as a one-level data variable
# mapped onto the default palette, which does not give yellow bars and
# triggers RColorBrewer "minimal value for n is 3" warnings.
data_HEV %>%
  group_by(isolation_source) %>%
  summarise(Count = n()) %>%
  plot_ly(x = ~isolation_source, y = ~Count, type = "bar", marker = list(color = "yellow")) %>%
  layout(title = "Repartition par source d'isolation")
## Warning: Ignoring 1 observations
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Chargement des bibliothèques nécessaires
library(ggplot2)
library(plotly)
library(dplyr)
# Summary table: number of sequences for each Location.Qualifiers value.
data_HEV_grouped <- data_HEV %>%
  group_by(Location.Qualifiers) %>%
  summarise(Nombre_de_Sequences = n())

# Row-aligned copy of the data carrying, for every sequence, the number of
# sequences that share its Location.Qualifiers value. The marker size must
# have one value per plotted point; the summary table above has one value per
# distinct location, so using it directly pairs sizes with the wrong points.
data_HEV_sized <- data_HEV %>%
  group_by(Location.Qualifiers) %>%
  mutate(Nombre_de_Sequences = n()) %>%
  ungroup()

# Interactive scatter plot: sequence length on the x axis, genomic coordinates
# on the y axis, marker size given by the per-location sequence count.
plot_ly(
  data = data_HEV_sized,
  x = ~Sequence_Length,
  y = ~Location.Qualifiers,
  type = "scatter",
  mode = "markers",
  marker = list(size = ~Nombre_de_Sequences)
) %>%
  layout(
    title = "Diagramme de dispersion des coordonnées génomiques, des longueurs séquences",
    xaxis = list(title = "Longueur de la séquence"),
    yaxis = list(title = "Coordonnées génomiques"),
    showlegend = FALSE
  )